import os
import time
from urllib.parse import urljoin, urlsplit
from urllib.request import urlretrieve, build_opener, install_opener

import requests
import xlwt
from bs4 import BeautifulSoup


# Fixed Cookie string sent with each request.
_COOKIE = (
    'Hm_lvt_8e27732e26e78ee7975a6f697a0d3bbf=1677850357,1678426339,1678680217,1679478512; '
    'zh_choose=n; '
    'arialoadData=false'
)

# Desktop Chrome-on-macOS User-Agent string.
_USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/111.0.0.0 Safari/537.36'
)

# HTTP headers shared by the page requests and the file downloads.
headers = {'Cookie': _COOKIE, 'User-Agent': _USER_AGENT}

# Fetch the listing page. The timeout keeps a stalled connection from hanging
# the script forever, and raise_for_status() stops us from silently scraping
# an HTTP error page as if it were an empty listing.
req = requests.get(
    r'http://audit.sz.gov.cn/zxbs/sjgzbg/bjyszxbg/index.html',
    headers=headers,
    timeout=30,
)
req.raise_for_status()
soup = BeautifulSoup(req.text, features='lxml')

# Every <li> under .right_list is one listing entry (date <span> + <a> link).
lis = soup.select('.right_list ul li')

# Downloads are stored under ./files; exist_ok avoids the check-then-create race.
os.makedirs('files', exist_ok=True)

# In-memory workbook with a single sheet that receives one row per report.
info = xlwt.Workbook()
sheet = info.add_sheet('本级预算执行和其他财政收支的审计工作报告')

# Header row (row 0): title, source, date, link, content.
first_rows = ['标题', '来源', '日期', '链接', '内容']
for column, heading in enumerate(first_rows):
    sheet.write(0, column, heading)

# The .xls format caps a cell at 32767 characters; longer text cannot be
# written, so the article body is truncated to this length.
MAX_CELL_CHARS = 32767
# Seconds to wait on a detail-page request before giving up on that entry.
REQUEST_TIMEOUT = 30


def _first(node, selector):
    """Return the first element under *node* matching the CSS *selector*.

    Raises:
        LookupError: if nothing matches, with the selector in the message so
            a layout change is easy to diagnose.
    """
    matches = node.select(selector)
    if not matches:
        raise LookupError('no element matches {!r}'.format(selector))
    return matches[0]


def _local_path(name, url):
    """Return the path under ``files/`` where *url* should be stored.

    *name* is the link text. Path separators are replaced so it cannot point
    outside ``files/``; an empty name falls back to the URL's file name; and
    the URL's extension is appended when the name does not already end with it.
    """
    url_name = os.path.basename(urlsplit(url).path)
    safe = name.replace('/', '_').replace('\\', '_').strip()
    safe = safe or url_name or 'download'
    ext = os.path.splitext(url_name)[1]
    if ext and not safe.lower().endswith(ext.lower()):
        safe += ext
    return os.path.join('files', safe)


def _download(url, name):
    """Download *url* into ``files/``; report a failure instead of raising."""
    try:
        urlretrieve(url, _local_path(name, url))
    except (OSError, ValueError) as exc:
        # URLError/HTTPError are OSError subclasses; ValueError covers
        # hrefs that are not fetchable URLs. One bad file must not stop the run.
        print('download failed:', url, exc)


def _scrape_entry(li):
    """Return ``(title, source, date, link, contents)`` for one listing item.

    An item linking directly to a PDF is downloaded and recorded with the
    link text as its title and empty source/contents. Any other item is
    fetched as an HTML article page, and its attachments are downloaded.

    Raises:
        LookupError: if an expected element or ``href`` attribute is missing.
        requests.RequestException: if the article page cannot be fetched.
    """
    date = _first(li, 'span').text
    anchor = _first(li, 'a')
    # hrefs may be relative; resolve them against the listing page URL.
    link = urljoin(req.url, anchor['href'])
    print(link)

    if urlsplit(link).path.lower().endswith('.pdf'):
        # A PDF is not an article page: parsing it as HTML would find none of
        # the expected elements, so record the listing data and stop here.
        title = anchor.text.strip()
        _download(link, title)
        return title, '', date, link, ''

    req2 = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
    req2.raise_for_status()
    soup2 = BeautifulSoup(req2.text, features='lxml')
    title = _first(soup2, 'div.tit > h1').text
    source = _first(soup2, 'div.tit > h6 #ly').text
    contents = _first(soup2, 'div.news_cont_d_wrap').text

    for attachment in soup2.select('div.x866 dd>a'):
        href = attachment.get('href')
        if not href:
            continue
        tit = attachment.text.strip()
        link2 = urljoin(link, href)
        print(tit, link2)
        _download(link2, tit)

    print(title, source, date, link)
    print(contents)
    return title, source, date, link, contents.strip()[:MAX_CELL_CHARS]


# urlretrieve uses the globally installed opener, so install one carrying the
# shared headers once instead of rebuilding it for every download.
opener = build_opener()
opener.addheaders = list(headers.items())
install_opener(opener)

# Row 0 holds the column headings, so data rows start at 1.
index = 1
for li in lis:
    try:
        row = _scrape_entry(li)
    except (requests.RequestException, LookupError) as exc:
        # Skip the broken entry but keep the rows collected so far.
        print('skipped entry:', exc)
    else:
        for column, value in enumerate(row):
            sheet.write(index, column, value)
        index += 1
    # Pause between entries to avoid hammering the server.
    time.sleep(1)

# Write the populated workbook to disk in the current working directory.
output_file = '本级预算执行和其他财政收支的审计工作报告2.xls'
info.save(output_file)
